WebPageSnapshotService.java example

Explorer
kite-spring-hbase-example-master
- src
  - main
    - java
      - org
        kitesdk
        spring
        hbase
        example
        controller
        WebPageSnapshotController.java
        helper
        DevHelper.java
        model
        converter
        WebPageSnapshotContentConverter.java
        WebPageSnapshotMetaConverter.java
        frontend
        WebPageSnapshotContent.java
        WebPageSnapshotMeta.java
        service
        WebPageSnapshotService.java
/**
 * Copyright 2014 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.spring.hbase.example.service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Key;
import org.kitesdk.data.RandomAccessDataset;
import org.kitesdk.spring.hbase.example.model.WebPageRedirectModel;
import org.kitesdk.spring.hbase.example.model.WebPageSnapshotModel;
import org.kitesdk.spring.hbase.example.model.frontend.WebPageSnapshotContent;
import org.kitesdk.spring.hbase.example.model.frontend.WebPageSnapshotMeta;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.convert.ConversionService;
import org.springframework.stereotype.Component;

/**
 * Service for WebPageSnapshot operations: fetching a page over HTTP, storing
 * the resulting snapshot (and any redirect mapping) in the injected datasets,
 * and reading snapshots back as frontend models.
 *
 * Snapshot rows are keyed by (url, fetchedAtRevTs), where fetchedAtRevTs is
 * <code>Long.MAX_VALUE - fetchedAt</code>. Every range or key lookup in this
 * class must therefore translate epoch timestamps into that reversed form.
 */
@Component
public class WebPageSnapshotService {

  @Autowired
  private RandomAccessDataset<WebPageSnapshotModel> webPageSnapshotModels;

  @Autowired
  private RandomAccessDataset<WebPageRedirectModel> webPageRedirectModels;

  @Autowired
  private ConversionService conversionService;

  /**
   * Take a snapshot of an URL. This WebPageSnapshot is stored in HBase. Returns
   * the WebPageSnapshotMeta
   * 
   * If the URL is a redirect, the snapshot is stored under the final URL
   * destination. A WebPageRedirectModel is stored in the redirect table so when
   * fetching snapshots, we can follow the proper redirect path.
   * 
   * @param url
   *          The URL to take a snapshot of
   * @return The WebPageSnapshotMeta for the page that we snapshotted.
   * @throws IOException
   *           Thrown if there's an issue fetching the web page.
   */
  public WebPageSnapshotMeta takeSnapshot(String url) throws IOException {
    WebPageSnapshotModel webPageSnapshotModel = fetchWebPage(url);
    if (!webPageSnapshotModel.getUrl().equals(url)) {
      // Url is different, so must have redirected. Store the redirect model
      WebPageRedirectModel redirectModel = WebPageRedirectModel.newBuilder()
          .setUrl(url).setDestinationUrl(webPageSnapshotModel.getUrl()).build();
      webPageRedirectModels.put(redirectModel);
    } else {
      // If redirect exists, remove it since this URL no longer redirects
      Key key = new Key.Builder(webPageRedirectModels).add("url", url).build();
      WebPageRedirectModel redirectModel = webPageRedirectModels.get(key);
      if (redirectModel != null) {
        webPageRedirectModels.delete(key);
      }
    }
    webPageSnapshotModels.put(webPageSnapshotModel);
    return conversionService.convert(webPageSnapshotModel,
        WebPageSnapshotMeta.class);
  }

  /**
   * Get the most recent WebPageSnapshotMeta from HBase
   * 
   * @param url
   *          The URL of the WebPageSnapshotMeta to get from HBase.
   * @return The WebPageSnapshotMeta, or null if one doesn't exist for this URL.
   */
  public WebPageSnapshotMeta getWebPageSnapshotMeta(String url) {
    return convertOrNull(getMostRecentWebPageSnapshot(url),
        WebPageSnapshotMeta.class);
  }

  /**
   * Get the WebPageSnapshotMeta that was fetched at a particular timestamp from
   * HBase
   * 
   * @param url
   *          The URL of the WebPageSnapshotMeta to get from HBase.
   * @param ts
   *          The snapshot timestamp of the WebPageSnapshotMeta to get from
   *          HBase.
   * @return The WebPageSnapshotMeta, or null if one doesn't exist for this URL
   *         at this timestamp.
   */
  public WebPageSnapshotMeta getWebPageSnapshotMeta(String url, long ts) {
    return convertOrNull(getWebPageSnapshot(url, ts),
        WebPageSnapshotMeta.class);
  }

  /**
   * Get all WebPageSnapshotMeta from an URL that have been snapshotted since
   * the "since" param.
   * 
   * @param url
   *          The URL to get WebPageSnapshotMeta instances from
   * @param since
   *          The epoch timestamp
   * @return The list of WebPageSnapshotMeta instances.
   */
  public List<WebPageSnapshotMeta> getWebPageSnapshotMetaSince(String url,
      long since) {
    return convertList(getWebPageSnapshotsSince(url, since),
        WebPageSnapshotMeta.class);
  }

  /**
   * Get the most recent WebPageSnapshotContent from HBase
   * 
   * @param url
   *          The URL to fetch the most recent WebPageSnapshotContent from
   * @return The WebPageSnapshotContent, or null if one doesn't exist for this
   *         URL.
   */
  public WebPageSnapshotContent getWebPageSnapshotContent(String url) {
    return convertOrNull(getMostRecentWebPageSnapshot(url),
        WebPageSnapshotContent.class);
  }

  /**
   * Get the WebPageSnapshotContent that was fetched at a particular timestamp
   * from HBase
   * 
   * @param url
   *          The URL of the WebPageSnapshotContent to get from HBase.
   * @param ts
   *          The snapshot timestamp of the WebPageSnapshotContent to get from
   *          HBase.
   * @return The WebPageSnapshotContent, or null if one doesn't exist for this
   *         URL at this timestamp.
   */
  public WebPageSnapshotContent getWebPageSnapshotContent(String url, long ts) {
    return convertOrNull(getWebPageSnapshot(url, ts),
        WebPageSnapshotContent.class);
  }

  /**
   * Get all WebPageSnapshotContent from an URL that have been snapshotted since
   * the "since" param.
   * 
   * @param url
   *          The URL to get WebPageSnapshotContent instances from
   * @param since
   *          The epoch timestamp
   * @return The list of WebPageSnapshotContent instances.
   */
  public List<WebPageSnapshotContent> getWebPageSnapshotContentSince(
      String url, long since) {
    return convertList(getWebPageSnapshotsSince(url, since),
        WebPageSnapshotContent.class);
  }

  /**
   * Get the epoch timestamps for every snapshot time of an URL in HBase.
   * 
   * @param url
   *          The URL of the page to get snapshot timestamps for
   * @return The list of timestamps, in the order the reader returns them;
   *         empty if the URL has no snapshots.
   */
  public List<Long> getSnapshotTimestamps(String url) {
    url = normalizeUrl(url);
    List<Long> snapshotTimestamps = new ArrayList<Long>();
    DatasetReader<WebPageSnapshotModel> reader = newSnapshotReader(url,
        Long.MAX_VALUE);
    try {
      while (reader.hasNext()) {
        snapshotTimestamps.add(reader.next().getFetchedAt());
      }
    } finally {
      reader.close();
    }
    return snapshotTimestamps;
  }

  /**
   * Get the most recent WebPageSnapshotModel from HBase
   * 
   * @param url
   *          The URL to get the snapshotted page from HBase
   * @return The WebPageSnapshotModel, or null if there are no fetches for this
   *         URL
   */
  private WebPageSnapshotModel getMostRecentWebPageSnapshot(String url) {
    url = normalizeUrl(url);
    // we don't know the exact timestamp in the key, but we know since keys
    // are in timestamp descending order that the first row for an URL will be
    // the most recent.
    DatasetReader<WebPageSnapshotModel> reader = newSnapshotReader(url,
        Long.MAX_VALUE);
    try {
      return reader.hasNext() ? reader.next() : null;
    } finally {
      reader.close();
    }
  }

  /**
   * Get the WebPageSnapshotModel from HBase
   * 
   * @param url
   *          The URL of the WebPageSnapshotModel
   * @param ts
   *          The snapshot timestamp of the WebPageSnapshotModel
   * @return The WebPageSnapshotModel, or null if there is no snapshot for the
   *         URL at this timestamp.
   */
  private WebPageSnapshotModel getWebPageSnapshot(String url, long ts) {
    url = normalizeUrl(url);
    // The key stores the reversed timestamp, so convert before the lookup.
    Key key = new Key.Builder(webPageSnapshotModels).add("url", url)
        .add("fetchedAtRevTs", Long.MAX_VALUE - ts).build();
    return webPageSnapshotModels.get(key);
  }

  /**
   * Get WebPageSnapshotModels for an URL from HBase since the since param.
   * 
   * @param url
   *          The URL of the page to fetch
   * @param since
   *          The epoch timestamp; only snapshots fetched at or after it are
   *          returned. Negative values are treated as 0.
   * @return The list of models that have been fetched for an URL since the
   *         since param.
   */
  private List<WebPageSnapshotModel> getWebPageSnapshotsSince(String url,
      long since) {
    url = normalizeUrl(url);
    // Keys hold Long.MAX_VALUE - fetchedAt, so "fetchedAt >= since" becomes
    // "fetchedAtRevTs <= Long.MAX_VALUE - since". Clamping at 0 keeps the
    // subtraction from overflowing for negative input.
    long maxRevTs = Long.MAX_VALUE - Math.max(since, 0L);
    List<WebPageSnapshotModel> models = new ArrayList<WebPageSnapshotModel>();
    DatasetReader<WebPageSnapshotModel> reader = newSnapshotReader(url,
        maxRevTs);
    try {
      while (reader.hasNext()) {
        models.add(reader.next());
      }
    } finally {
      reader.close();
    }
    return models;
  }

  /**
   * Open a reader over the snapshots of a single URL whose reversed timestamp
   * lies between 0 and maxRevTs. The caller is responsible for closing the
   * returned reader.
   * 
   * @param url
   *          The (already normalized) URL to scan
   * @param maxRevTs
   *          The upper bound for fetchedAtRevTs
   * @return The reader over the matching snapshots
   */
  private DatasetReader<WebPageSnapshotModel> newSnapshotReader(String url,
      long maxRevTs) {
    return webPageSnapshotModels.from("url", url)
        .from("fetchedAtRevTs", 0L).to("url", url)
        .to("fetchedAtRevTs", maxRevTs).newReader();
  }

  /**
   * Normalize an URL, which currently only consists of returning a redirect
   * destination if an URL is a redirect, or otherwise the passed in url.
   * 
   * @param url
   *          The url to normalize
   * @return The normalized URL
   */
  private String normalizeUrl(String url) {
    // If this url is a redirect, get its destination URL to fetch from our
    // HBase store since we store all snapshots under the final destination the
    // page lives at.
    WebPageRedirectModel redirectModel = getRedirect(url);
    if (redirectModel != null) {
      return redirectModel.getDestinationUrl();
    } else {
      return url;
    }
  }

  /**
   * Return a WebPageRedirectModel if an URL is one that redirects to a
   * different source. Otherwise, returns null.
   * 
   * @param url
   *          The URL to look up in the redirect dataset
   * @return The WebPageRedirectModel, or null if none is stored for the URL
   */
  private WebPageRedirectModel getRedirect(String url) {
    Key key = new Key.Builder(webPageRedirectModels).add("url", url).build();
    return webPageRedirectModels.get(key);
  }

  /**
   * Fetch the web page from the URL, parse the HTML to populate the metadata
   * required by WebPageSnapshotModel, and return the constructed
   * WebPageSnapshotModel.
   * 
   * @param url
   *          The URL to fetch the web page from
   * @return The WebPageSnapshotModel, keyed by the URL the response actually
   *         came from (which differs from url after a redirect)
   * @throws IOException
   *           Thrown if there's an issue fetching the web page.
   */
  private WebPageSnapshotModel fetchWebPage(String url) throws IOException {
    long fetchTime = System.currentTimeMillis();
    Connection connection = Jsoup.connect(url);
    Response response = connection.execute();
    long postFetchTime = System.currentTimeMillis();
    int timeToFetch = (int) (postFetchTime - fetchTime);

    Document doc = response.parse();
    String destinationUrl = response.url().toString();
    String title = doc.title();
    String description = getDescriptionFromDocument(doc);
    List<String> keywords = getKeywordsFromDocument(doc);
    List<String> outlinks = getOutlinksFromDocument(doc);
    // Serialize the document once; it is used for both the size and content.
    String html = doc.html();

    return WebPageSnapshotModel.newBuilder().setUrl(destinationUrl)
        .setFetchedAtRevTs(Long.MAX_VALUE - fetchTime)
        .setSize(html.length()).setFetchedAt(fetchTime)
        .setFetchTimeMs(timeToFetch).setTitle(title)
        .setDescription(description).setKeywords(keywords)
        .setOutlinks(outlinks).setContent(html).build();
  }

  /**
   * Parse the description out of the meta tag if one exists. Otherwise, return
   * an empty string.
   * 
   * @param doc
   *          The Document to parse
   * @return The description if it exists in the HTML, otherwise an empty
   *         string.
   */
  private String getDescriptionFromDocument(Document doc) {
    Elements metaDescriptionElements = doc.select("meta[name=description]");
    return metaDescriptionElements.size() > 0 ? metaDescriptionElements
        .attr("content") : "";
  }

  /**
   * Parse the keywords out of the meta tag if one exists. Otherwise, return an
   * empty list. Keywords are trimmed, and blank entries (from an empty content
   * attribute or consecutive commas) are skipped.
   * 
   * @param doc
   *          The Document to parse
   * @return The list of keywords.
   */
  private List<String> getKeywordsFromDocument(Document doc) {
    List<String> keywords = new ArrayList<String>();
    Elements keywordsElements = doc.select("meta[name=keywords]");
    for (Element keywordsElement : keywordsElements) {
      for (String keyword : keywordsElement.attr("content").split(",")) {
        String trimmed = keyword.trim();
        // "".split(",") yields a single empty string; don't store it.
        if (!trimmed.isEmpty()) {
          keywords.add(trimmed);
        }
      }
    }
    return keywords;
  }

  /**
   * Parse the outlinks from a href tags in the document, and return them as a
   * list
   * 
   * @param doc
   *          The document to parse
   * @return The list of outlinks as URL strings, exactly as written in the
   *         href attributes (trimmed).
   */
  private List<String> getOutlinksFromDocument(Document doc) {
    List<String> outlinks = new ArrayList<String>();
    Elements linkElements = doc.select("a[href]");
    for (Element linkElement : linkElements) {
      outlinks.add(linkElement.attr("href").trim());
    }
    return outlinks;
  }

  /**
   * Use the conversionService to convert a snapshot model to clazz, passing a
   * missing model through as null.
   * 
   * @param model
   *          The model to convert, may be null
   * @param clazz
   *          The class to convert the model to
   * @return The converted object, or null if model is null.
   */
  private <T> T convertOrNull(WebPageSnapshotModel model, Class<T> clazz) {
    if (model == null) {
      return null;
    }
    return conversionService.convert(model, clazz);
  }

  /**
   * Use the conversionService to convert a list of objects to clazz
   * 
   * @param list
   *          The list of objects to convert
   * @param clazz
   *          The class to convert those objects to
   * @return The list of converted objects.
   */
  private <T> List<T> convertList(List<?> list, Class<T> clazz) {
    List<T> returnList = new ArrayList<T>(list.size());
    for (Object o : list) {
      returnList.add(conversionService.convert(o, clazz));
    }
    return returnList;
  }
}